suppressMessages(library(readxl))
suppressMessages(library(plotly))
suppressMessages(library(randomForest))
suppressMessages(library(dplyr))
suppressMessages(library(GGally))
suppressMessages(library(ggplot2))
suppressMessages(library(scatterplot3d))
suppressMessages(library(plm))

########################################################
## Clean Data
##
## Delete the data which height and weight = 'NA' (Invalid BMI data)
########################################################
dat_ex_all <- read_excel("./SharedFiles/ST606/2020/data/Exercise/fit_database_anthropometric_all.xlsx", sheet=1, na='NA')

# Clean data (delete all data that the follow conditions are NA)
dat <- subset(dat_ex_all, `height (cm)` != 'NA' 
                & `weight (kg)` != 'NA')
# Change the date to year
dat$mYear <- substr(as.character(dat$`measurement date`), start=1, stop=4)

# Change the colnames
colnames(dat)[10] <- 'z_category'
colnames(dat)[9] <- 'z_score'
colnames(dat)[2] <- 'm_date'
colnames(dat)[3] <- 'age'
colnames(dat)[4] <- 'age_bin'
colnames(dat)[6] <- 'height'
colnames(dat)[7] <- 'weight'

dataSta <- data.frame(all = nrow(dat_ex_all), obj = nrow(dat), del = nrow(dat_ex_all) - nrow(dat)) 

# new age bin by 0.5 year
dat$new_age_bin = as.factor(ifelse((dat$age - dat$age_bin)>= 0.5,dat$age_bin + 0.5 , dat$age_bin))

# delete error data
dat$indx <- paste(dat$ID, dat$new_age_bin, sep='||')
index <- duplicated(dat[,13])
dat_u <- dat[!index, 1:12]
########################################################
## Show the pie chart
##  data: input data
##  type: 0:by year 1:by age
##  value: condition
##  gender: 0:Girl 1:Boy 2:All
########################################################
showPie <- function(data, type, value, gender) {
  title <- 'The Healthy Ratio'
  # by year
  if (type == 0) {
    title <- paste(title," in ", value)
    # Gender:All
    if (gender == 2) {
      # all 
      nums <-  c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value ),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value ),]),nrow(dat_uc[which(dat_uc$z_category=="overweight"& dat_uc$mYear==value ),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$mYear==value ),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$mYear==value ),]))
      drawPie(nums, title)
    }
    # Gender:Boy
    else if (gender == 1) {
      # boy
      title1 <- paste(title," (Boy)")
      nums <-  c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$mYear==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$mYear==value & dat_uc$gender=='boy'),]))
      drawPie(nums, title1)
    } 
    # Gender:Girl
    else if (gender == 0) {
    # girl
      title2 <- paste(title," (Girl)")
      nums <-  c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$mYear==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$mYear==value & dat_uc$gender=='girl'),]))
      drawPie(nums, title2)
    }
  }
  # by age bin
  else {
    title <- paste(title," ", value, " year-old")
    # Gender:All
    if (gender == 2) {
      # all 
      nums <-  c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value ),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value ),]),nrow(dat_uc[which(dat_uc$z_category=="overweight"& dat_uc$age_bin==value ),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$age_bin==value ),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$age_bin==value ),]))
      drawPie(nums, title)
    } 
    # Gender:Boy
    else if (gender == 1) {
      # boy
      title1 <- paste(title," (Boy)")
      nums <-  c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$age_bin==value & dat_uc$gender=='boy'),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$age_bin==value & dat_uc$gender=='boy'),]))
      drawPie(nums, title1)
    } 
    # Gender:Girl
    else if (gender == 0) {
    # girl
      title2 <- paste(title," (Girl)")
      nums <-  c(nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="severely thin"& dat_uc$age_bin==value & dat_uc$gender=='girl'),]),nrow(dat_uc[which(dat_uc$z_category=="thin"& dat_uc$age_bin==value & dat_uc$gender=='girl'),]))
      drawPie(nums, title2)
    }    
  }
}
########################################################
## Show the pie chart
##  data: input data
##  type: 0:by year 1:by age
##  value: condition
##  gender: 0:Girl 1:Boy 2:All
########################################################
getRatio <- function(data, type, gender) {
  pDataNor <- c()
  pDataObe <- c()
  pDataOver <- c()
  pDataSThin <- c()
  pDataThin <- c()
  # by year
  if (type == 0) {
    # Gender:All
    if (gender == 2) {
      # all
      for (value in 2007:2018) {
        norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value),])/nrow(dat_uc[which(dat_uc$mYear==value),])
        pDataNor <- c(pDataNor, norPer)

        norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value ),])/nrow(dat_uc[which(dat_uc$mYear==value),])
        pDataObe <- c(pDataObe, norObe)
        
        norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value ),])/nrow(dat_uc[which(dat_uc$mYear==value),])
        pDataOver <- c(pDataOver, norOver)
        
        norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$mYear==value ),])/nrow(dat_uc[which(dat_uc$mYear==value),])
        pDataSThin <- c(pDataSThin, norSThin)
        
        norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$mYear==value ),])/nrow(dat_uc[which(dat_uc$mYear==value),])
        pDataThin <- c(pDataThin, norThin)
      }
      pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
      pData
    }
    # Gender:Boy
    else if (gender == 1) {
      # boy
     for (value in 2007:2018) {
        norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value & dat_uc$gender=='boy'),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
        pDataNor <- c(pDataNor, norPer)

        norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
        pDataObe <- c(pDataObe, norObe)
        
        norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
        pDataOver <- c(pDataOver, norOver)
        
        norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$mYear==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
        pDataSThin <- c(pDataSThin, norSThin)
        
        norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$mYear==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='boy'),])
        pDataThin <- c(pDataThin, norThin)
      }
      pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
      pData
    } 
    # Gender:Girl
    else if (gender == 0) {
    # girl
     for (value in 2007:2018) {
        norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$mYear==value & dat_uc$gender=='girl'),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
        pDataNor <- c(pDataNor, norPer)

        norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$mYear==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
        pDataObe <- c(pDataObe, norObe)
        
        norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$mYear==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
        pDataOver <- c(pDataOver, norOver)
        
        norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$mYear==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
        pDataSThin <- c(pDataSThin, norSThin)
        
        norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$mYear==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$mYear==value & dat_uc$gender=='girl'),])
        pDataThin <- c(pDataThin, norThin)
      }
      pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
      pData
    }
  }
  # by age bin
  else {
    # Gender:All
    if (gender == 2) {
      # all 
      for (value in 6:18) {
        norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
        pDataNor <- c(pDataNor, norPer)

        norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value ),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
        pDataObe <- c(pDataObe, norObe)
        
        norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value ),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
        pDataOver <- c(pDataOver, norOver)
        
        norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$age_bin==value ),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
        pDataSThin <- c(pDataSThin, norSThin)
        
        norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$age_bin==value ),])/nrow(dat_uc[which(dat_uc$age_bin==value),])
        pDataThin <- c(pDataThin, norThin)
      }
      pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
      pData
    } 
    # Gender:Boy
    else if (gender == 1) {
      # boy
      for (value in 6:18) {
        norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value & dat_uc$gender=='boy'),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
        pDataNor <- c(pDataNor, norPer)

        norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
        pDataObe <- c(pDataObe, norObe)
        
        norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
        pDataOver <- c(pDataOver, norOver)
        
        norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$age_bin==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
        pDataSThin <- c(pDataSThin, norSThin)
        
        norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$age_bin==value & dat_uc$gender=='boy' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='boy'),])
        pDataThin <- c(pDataThin, norThin)
      }
      pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
      pData
    } 
    # Gender:Girl
    else if (gender == 0) {
    # girl
      for (value in 6:18) {
        norPer <- nrow(dat_uc[which(dat_uc$z_category=="normal" & dat_uc$age_bin==value & dat_uc$gender=='girl'),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
        pDataNor <- c(pDataNor, norPer)

        norObe <- nrow(dat_uc[which(dat_uc$z_category=="obese" & dat_uc$age_bin==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
        pDataObe <- c(pDataObe, norObe)
        
        norOver <- nrow(dat_uc[which(dat_uc$z_category=="overweight" & dat_uc$age_bin==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
        pDataOver <- c(pDataOver, norOver)
        
        norSThin <- nrow(dat_uc[which(dat_uc$z_category=="severely thin" & dat_uc$age_bin==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
        pDataSThin <- c(pDataSThin, norSThin)
        
        norThin <- nrow(dat_uc[which(dat_uc$z_category=="thin" & dat_uc$age_bin==value & dat_uc$gender=='girl' ),])/nrow(dat_uc[which(dat_uc$age_bin==value & dat_uc$gender=='girl'),])
        pDataThin <- c(pDataThin, norThin)
      }
      pData <- c(pDataNor, pDataObe, pDataOver, pDataSThin, pDataThin)
      pData
    }    
  }
}
########################################################
## Draw the pie chart
##  nums: count by category
##  title: title
########################################################
drawPie <- function(nums, title){
  # Create Data
  data <- data.frame(
    Category=c('normal','obese','overweight','severely thin', 'thin' ),
    value=nums
  )
  
  # Edit label 
  label_value <- paste('(', round(data$value/sum(data$value) * 100, 1), '%)', sep = '')
  label <- paste(data$Category, label_value, sep = '')
  
  # Basic piechart
  ggplot(data, aes(x="", y=value, fill=Category)) +
    geom_bar(stat="identity", width=1, color="white") +
    theme_void() + coord_polar(theta = 'y') + labs(x = '', y = '', title = '') + theme(axis.text = element_blank()) + theme(axis.ticks = element_blank()) + scale_fill_discrete(labels = label) +
    ggtitle(title)
}
########################################################
## Show trend chart
##  data: input data
##  type: 0:by year 1:by age
##  gender: 0:Girl 1:Boy 2:All
########################################################
showTrend <- function(data, ty, gender) {
  if (ty == 0) {
    title <- 'The trend of Healthy Ratio 2007~2018'
    if (gender == 1) {
      title <- paste(title, " (Boy)")
    } else if (gender == 0) {
      title <- paste(title, " (Girl)")
    }
    Year <- rep(2007:2018, times = 5)
    type <- rep(c('normal','obese','overweight','severely thin', 'thin'), each = 12)
    Proportion <- getRatio(data, ty, gender)
    df <- data.frame(year = Year, type = type, proportion = Proportion)
    ggplot(data = df, mapping = aes(x = Year, y = Proportion, colour = type)) + geom_line()+
        ggtitle(title)
  } else {
    title <- 'The trend of Healthy Ratio 6~18 year-old'
    if (gender == 1) {
      title <- paste(title, " (Boy)")
    } else if (gender == 0) {
      title <- paste(title, " (Girl)")
    }
    Age <- rep(6:18, times = 5)
    type <- rep(c('normal','obese','overweight','severely thin', 'thin'), each = 13)
    Proportion <- getRatio(data, ty, gender)
    df <- data.frame(Age = Age, type = type, proportion = Proportion)
    ggplot(data = df, mapping = aes(x = Age, y = Proportion, colour = type)) + geom_line()+
        ggtitle(title)
  }
}
########################################################
## z_category by age
##
## the percentage of different age
########################################################
# ALL
dat_u$z_category = as.factor(dat_u$z_category)
dat_uc <- dat_u
dat_uc$count <- rep(1, times = nrow(dat_uc))

# Show the pie by age
showPie(dat_uc, 1, 6, 2)

showPie(dat_uc, 1, 6, 1)

showPie(dat_uc, 1, 6, 0)

showPie(dat_uc, 1, 7, 2)

showPie(dat_uc, 1, 7, 1)

showPie(dat_uc, 1, 7, 0)

showPie(dat_uc, 1, 8, 2)

showPie(dat_uc, 1, 8, 1)

showPie(dat_uc, 1, 8, 0)

showPie(dat_uc, 1, 9, 2)

showPie(dat_uc, 1, 9, 1)

showPie(dat_uc, 1, 9, 0)

showPie(dat_uc, 1, 10, 2)

showPie(dat_uc, 1, 10, 1)

showPie(dat_uc, 1, 10, 0)

showPie(dat_uc, 1, 11, 2)

showPie(dat_uc, 1, 11, 1)

showPie(dat_uc, 1, 11, 0)

showPie(dat_uc, 1, 12, 2)

showPie(dat_uc, 1, 12, 1)

showPie(dat_uc, 1, 12, 0)

showPie(dat_uc, 1, 13, 2)

showPie(dat_uc, 1, 13, 1)

showPie(dat_uc, 1, 13, 0)

showPie(dat_uc, 1, 14, 2)

showPie(dat_uc, 1, 14, 1)

showPie(dat_uc, 1, 14, 0)

showPie(dat_uc, 1, 15, 2)

showPie(dat_uc, 1, 15, 1)

showPie(dat_uc, 1, 15, 0)

showPie(dat_uc, 1, 16, 2)

showPie(dat_uc, 1, 16, 1)

showPie(dat_uc, 1, 16, 0)

showPie(dat_uc, 1, 17, 2)

showPie(dat_uc, 1, 17, 1)

showPie(dat_uc, 1, 17, 0)

showPie(dat_uc, 1, 18, 2)

showPie(dat_uc, 1, 18, 1)

showPie(dat_uc, 1, 18, 0)

########################################################
## z_category By year
##
## the percentage of different year
########################################################
# Show the pie by year
showPie(dat_uc, 0, '2007', 2)

showPie(dat_uc, 0, '2007', 1)

showPie(dat_uc, 0, '2007', 0)

showPie(dat_uc, 0, '2008', 2)

showPie(dat_uc, 0, '2008', 1)

showPie(dat_uc, 0, '2008', 0)

showPie(dat_uc, 0, '2009', 2)

showPie(dat_uc, 0, '2009', 1)

showPie(dat_uc, 0, '2009', 0)

showPie(dat_uc, 0, '2010', 2)

showPie(dat_uc, 0, '2010', 1)

showPie(dat_uc, 0, '2010', 0)

showPie(dat_uc, 0, '2011', 2)

showPie(dat_uc, 0, '2011', 1)

showPie(dat_uc, 0, '2011', 0)

showPie(dat_uc, 0, '2012', 2)

showPie(dat_uc, 0, '2012', 1)

showPie(dat_uc, 0, '2012', 0)

showPie(dat_uc, 0, '2013', 2)

showPie(dat_uc, 0, '2013', 1)

showPie(dat_uc, 0, '2013', 0)

showPie(dat_uc, 0, '2014', 2)

showPie(dat_uc, 0, '2014', 1)

showPie(dat_uc, 0, '2014', 0)

showPie(dat_uc, 0, '2015', 2)

showPie(dat_uc, 0, '2015', 1)

showPie(dat_uc, 0, '2015', 0)

showPie(dat_uc, 0, '2016', 2)

showPie(dat_uc, 0, '2016', 1)

showPie(dat_uc, 0, '2016', 0)

showPie(dat_uc, 0, '2017', 2)

showPie(dat_uc, 0, '2017', 1)

showPie(dat_uc, 0, '2017', 0)

showPie(dat_uc, 0, '2018', 2)

showPie(dat_uc, 0, '2018', 1)

showPie(dat_uc, 0, '2018', 0)

########################################################
## z_category By Year
##
## the trend
########################################################
# All
showTrend(dat_uc, 0, 2)

# Boy
showTrend(dat_uc, 0, 1)

# Girl
showTrend(dat_uc, 0, 0)

########################################################
## z_category By age
##
## the trend
########################################################
# By Age
# All
showTrend(dat_uc, 1, 2)

# Boy
showTrend(dat_uc, 1, 1)

# Girl
showTrend(dat_uc, 1, 0)

########################################################
## Z-category ~ Height + Weight + age + gender
## 
## cluster relationship
########################################################
ggplot(data = dat_uc) + geom_point(mapping = aes(x = weight, y = height, color=z_category))

dat_boy <- dat_uc[which(dat_uc$gender=='boy' ),]
fig_boy <- plot_ly(dat_boy, x=~`weight`, y = ~`height`, z = ~`age`, color = ~`z_category`)
fig_boy <- fig_boy %>% add_markers()
fig_boy
dat_girl <- dat_uc[which(dat_uc$gender=='girl' ),]
fig_girl <- plot_ly(dat_girl, x=~`weight`, y = ~`height`, z = ~`age`, color = ~`z_category`)
fig_girl <- fig_girl %>% add_markers()
fig_girl
########################################################
## Predict Z-category ~ height + weight + age + gender (Machine Learning)
##  
## Sample 80% as training data
## Sample 20% as test data
## Use randomForest algorithm to learn the function
## then use the test data to predict the z-category and compare the actual result to chect the funtion
########################################################
dat_learn <- dat_uc

## change gender to number (boy:1, girl:0)
dat_learn$gender[dat_learn$gender == "boy"] <- 1
dat_learn$gender[dat_learn$gender == "girl"] <- 0
dat_learn$gender <- as.numeric(dat_learn$gender)

## measurement date,age (years), age bin have strong Collinearity. So I only remain the age(years)
dat_learn <- cbind(dat_learn[, 3], dat_learn[, 5:7], dat_learn[, 10]) 
names(dat_learn)[1:4]<-c("age", "gender", "height", "weight")

## sampling 80% for training, 20% for test.
ind <- sample(c(1,2), nrow(dat_learn), replace=T, prob = c(0.8, 0.2))
## training data
trainData <- dat_learn[ind == 1,]
## test data
testData <- dat_learn[ind == 2,]

set.seed(500)
ez.forest <- randomForest(z_category ~., data = trainData,
                          na.action = na.roughfix,
                          importance = TRUE)
ez.forest
## 
## Call:
##  randomForest(formula = z_category ~ ., data = trainData, importance = TRUE,      na.action = na.roughfix) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 2.08%
## Confusion matrix:
##               normal obese overweight severely thin thin class.error
## normal         51916     0        276             0   60 0.006430376
## obese              0  8862        256             0    0 0.028076333
## overweight       408   211      11869             0    0 0.049567585
## severely thin      3     0          0            71  104 0.601123596
## thin             245     0          0             7 1357 0.156619018
importance(ez.forest,type=2)
##        MeanDecreaseGini
## age           6555.2970
## gender         840.4034
## height        8611.3377
## weight       19668.8672
forest.pred <- predict(ez.forest, testData)
forest.perf <- table(testData$z_category, forest.pred,
                     dnn = c('Actual', 'Predicted'))
forest.perf
##                Predicted
## Actual          normal obese overweight severely thin  thin
##   normal         12984     0         67             0    19
##   obese              0  2193         67             0     0
##   overweight        96    51       3068             0     0
##   severely thin      0     0          0            17    26
##   thin              61     0          0             1   379
########################################################
## Predict Z-Score ~ age + gender (Long unbalanced panel data)
##   Way1: predict the z_score at 18(18.5) year
##   Way2: predict the unbalanced z_score
## plm
########################################################
dat_unq <- dat_uc
dat_unq$last_rs = rep(NA, times = nrow(dat_unq))
bId <- 0
z18 <- 0
for (i in 1:nrow(dat_unq)) {
  id <- dat_unq[i,1]$ID
  if (id == bId) {
    if (nrow(z18) != 0) {
      dat_unq$last_rs[i] = z18$z_score
    }
  }
  else {
    z18 <- dat_unq[which(dat_unq$ID==id & dat_unq$new_age_bin == '18.5'), 'z_score']
    if (nrow(z18) == 0) {
      z18 <- dat_unq[which(dat_unq$ID==id & dat_unq$new_age_bin == '18'), 'z_score']
    }
    if (nrow(z18) != 0) {
      dat_unq$last_rs[i] = z18$z_score
    } else {
      dat_unq$last_rs[i] = NA
      z18$z_score <- NA
    }
    bId <- id
  }
}

dat_Last <- dat_unq[which(is.na(dat_unq$last_rs) == FALSE ), ]

fitPlm1 <- plm(z_score~ gender + new_age_bin, data=dat_unq, model = 'random', index=c("ID", "new_age_bin"))
summary(fitPlm1)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = z_score ~ gender + new_age_bin, data = dat_unq, 
##     model = "random", index = c("ID", "new_age_bin"))
## 
## Unbalanced Panel: n = 13540, T = 1-22, N = 94674
## 
## Effects:
##                  var std.dev share
## idiosyncratic 0.2061  0.4540 0.123
## individual    1.4669  1.2112 0.877
## theta:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6490  0.8487  0.8823  0.8698  0.9037  0.9203 
## 
## Residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4.2921 -0.2651  0.0064 -0.0003  0.2752  4.4151 
## 
## Coefficients:
##                   Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept)      0.2753754  0.0356370  7.7272 1.099e-14 ***
## gendergirl      -0.0470912  0.0212205 -2.2191 0.0264773 *  
## new_age_bin6.5   0.0351477  0.0337905  1.0402 0.2982638    
## new_age_bin7     0.0680335  0.0331229  2.0540 0.0399788 *  
## new_age_bin7.5   0.0927227  0.0330026  2.8096 0.0049610 ** 
## new_age_bin8     0.1325888  0.0330846  4.0076 6.135e-05 ***
## new_age_bin8.5   0.1645342  0.0330953  4.9715 6.643e-07 ***
## new_age_bin9     0.1975643  0.0331715  5.9559 2.587e-09 ***
## new_age_bin9.5   0.1943423  0.0332092  5.8521 4.855e-09 ***
## new_age_bin10    0.1947005  0.0332475  5.8561 4.739e-09 ***
## new_age_bin10.5  0.1968568  0.0332809  5.9150 3.319e-09 ***
## new_age_bin11    0.1981328  0.0333163  5.9470 2.731e-09 ***
## new_age_bin11.5  0.1926030  0.0333386  5.7772 7.596e-09 ***
## new_age_bin12    0.1852653  0.0333395  5.5569 2.746e-08 ***
## new_age_bin12.5  0.1891827  0.0333541  5.6719 1.412e-08 ***
## new_age_bin13    0.1727504  0.0333721  5.1765 2.261e-07 ***
## new_age_bin13.5  0.1479326  0.0334039  4.4286 9.485e-06 ***
## new_age_bin14    0.1141124  0.0334420  3.4122 0.0006443 ***
## new_age_bin14.5  0.0626004  0.0334585  1.8710 0.0613467 .  
## new_age_bin15    0.0269095  0.0334964  0.8034 0.4217699    
## new_age_bin15.5 -0.0069914  0.0335210 -0.2086 0.8347853    
## new_age_bin16   -0.0203785  0.0335312 -0.6077 0.5433547    
## new_age_bin16.5 -0.0253267  0.0335797 -0.7542 0.4507137    
## new_age_bin17   -0.0390515  0.0336446 -1.1607 0.2457615    
## new_age_bin17.5 -0.0595620  0.0337190 -1.7664 0.0773254 .  
## new_age_bin18   -0.0522465  0.0338587 -1.5431 0.1228129    
## new_age_bin18.5 -0.0573948  0.0344423 -1.6664 0.0956333 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    19924
## Residual Sum of Squares: 19504
## R-Squared:      0.021062
## Adj. R-Squared: 0.020793
## Chisq: 2036.32 on 26 DF, p-value: < 2.22e-16
fitPlm2 <- plm(z_score~ gender + new_age_bin, data=dat_unq, model = "within", index=c("ID", "new_age_bin"))
summary(fitPlm2)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = z_score ~ gender + new_age_bin, data = dat_unq, 
##     model = "within", index = c("ID", "new_age_bin"))
## 
## Unbalanced Panel: n = 13540, T = 1-22, N = 94674
## 
## Residuals:
##     Min.  1st Qu.   Median  3rd Qu.     Max. 
## -4.28995 -0.22002  0.00000  0.22726  4.64759 
## 
## Coefficients:
##                  Estimate Std. Error t-value  Pr(>|t|)    
## new_age_bin6.5   0.037084   0.034047  1.0892 0.2760636    
## new_age_bin7     0.070264   0.033417  2.1026 0.0355039 *  
## new_age_bin7.5   0.093949   0.033306  2.8207 0.0047926 ** 
## new_age_bin8     0.133462   0.033395  3.9964 6.436e-05 ***
## new_age_bin8.5   0.164771   0.033409  4.9319 8.158e-07 ***
## new_age_bin9     0.198309   0.033490  5.9213 3.206e-09 ***
## new_age_bin9.5   0.193785   0.033531  5.7793 7.528e-09 ***
## new_age_bin10    0.193976   0.033572  5.7779 7.589e-09 ***
## new_age_bin10.5  0.196426   0.033608  5.8447 5.095e-09 ***
## new_age_bin11    0.198331   0.033645  5.8947 3.767e-09 ***
## new_age_bin11.5  0.192948   0.033670  5.7306 1.004e-08 ***
## new_age_bin12    0.185672   0.033673  5.5140 3.518e-08 ***
## new_age_bin12.5  0.188918   0.033689  5.6076 2.058e-08 ***
## new_age_bin13    0.172058   0.033710  5.1040 3.332e-07 ***
## new_age_bin13.5  0.147179   0.033744  4.3616 1.293e-05 ***
## new_age_bin14    0.113281   0.033786  3.3529 0.0008002 ***
## new_age_bin14.5  0.059502   0.033810  1.7599 0.0784308 .  
## new_age_bin15    0.021525   0.033859  0.6357 0.5249447    
## new_age_bin15.5 -0.011586   0.033888 -0.3419 0.7324238    
## new_age_bin16   -0.023808   0.033900 -0.7023 0.4824844    
## new_age_bin16.5 -0.027785   0.033950 -0.8184 0.4131234    
## new_age_bin17   -0.040330   0.034017 -1.1856 0.2357924    
## new_age_bin17.5 -0.060214   0.034095 -1.7661 0.0773894 .  
## new_age_bin18   -0.053063   0.034240 -1.5498 0.1212045    
## new_age_bin18.5 -0.060139   0.034841 -1.7261 0.0843290 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    17120
## Residual Sum of Squares: 16720
## R-Squared:      0.023339
## Adj. R-Squared: -0.13999
## F-statistic: 77.5311 on 25 and 81109 DF, p-value: < 2.22e-16
fitPlm3 <- plm(last_rs~ gender + z_score + new_age_bin, data=dat_Last, model = 'pooling', effect = "twoways", index=c("ID", "new_age_bin"))
summary(fitPlm3)
## Pooling Model
## 
## Call:
## plm(formula = last_rs ~ gender + z_score + new_age_bin, data = dat_Last, 
##     effect = "twoways", model = "pooling", index = c("ID", "new_age_bin"))
## 
## Unbalanced Panel: n = 3822, T = 1-22, N = 30915
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -4.033535 -0.296469 -0.012373  0.304697  4.639339 
## 
## Coefficients:
##                   Estimate Std. Error  t-value  Pr(>|t|)    
## (Intercept)     -0.1154118  0.0679273  -1.6990   0.08932 .  
## gendergirl       0.0354422  0.0068619   5.1651 2.418e-07 ***
## z_score          0.8731876  0.0027568 316.7387 < 2.2e-16 ***
## new_age_bin8    -0.0900486  0.0822225  -1.0952   0.27345    
## new_age_bin8.5  -0.0849295  0.0766365  -1.1082   0.26778    
## new_age_bin9    -0.0779625  0.0747532  -1.0429   0.29699    
## new_age_bin9.5  -0.0595303  0.0733562  -0.8115   0.41707    
## new_age_bin10   -0.0860052  0.0724358  -1.1873   0.23511    
## new_age_bin10.5 -0.0852668  0.0715346  -1.1920   0.23328    
## new_age_bin11   -0.1063984  0.0710718  -1.4971   0.13439    
## new_age_bin11.5 -0.1137045  0.0707826  -1.6064   0.10820    
## new_age_bin12   -0.1191120  0.0704488  -1.6908   0.09089 .  
## new_age_bin12.5 -0.1127691  0.0701931  -1.6066   0.10816    
## new_age_bin13   -0.0901450  0.0699564  -1.2886   0.19755    
## new_age_bin13.5 -0.0745277  0.0699142  -1.0660   0.28644    
## new_age_bin14   -0.0474844  0.0698774  -0.6795   0.49680    
## new_age_bin14.5  0.0228357  0.0696322   0.3279   0.74295    
## new_age_bin15    0.0641466  0.0693658   0.9248   0.35510    
## new_age_bin15.5  0.0780343  0.0692040   1.1276   0.25950    
## new_age_bin16    0.0896932  0.0690477   1.2990   0.19395    
## new_age_bin16.5  0.0987755  0.0689407   1.4328   0.15194    
## new_age_bin17    0.1129036  0.0688545   1.6397   0.10107    
## new_age_bin17.5  0.1182883  0.0687366   1.7209   0.08528 .  
## new_age_bin18    0.1157509  0.0686008   1.6873   0.09155 .  
## new_age_bin18.5  0.1207992  0.0689606   1.7517   0.07983 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    47703
## Residual Sum of Squares: 11221
## R-Squared:      0.76477
## Adj. R-Squared: 0.76459
## F-statistic: 4184.55 on 24 and 30890 DF, p-value: < 2.22e-16